The Dataset

The first dataset considered is the Steam Video Games Dataset. This dataset is a list of user behaviors, with columns: user-id, game-title, behavior-name, value. The behaviors included are ‘purchase’ and ‘play’. The value indicates the degree to which the behavior was performed - in the case of ‘purchase’ the value is always 1, and in the case of ‘play’ the value represents the number of hours the user has played the game.

# Load the raw Steam user-behaviour log. The CSV has no header row, so the
# column names are supplied here; once loaded the columns are referred to as
# user.id, game.title, behavior.name and value. The fifth column carries no
# documented meaning and is dropped.
raw_data <- as_tibble(
  read.csv(
    "steam-200k.csv",
    # FALSE rather than F: F is an ordinary variable that can be reassigned.
    header = FALSE,
    col.names = c("user-id", "game-title", "behavior-name", "value", "unknown")
  )
) %>%
  select(-unknown)
head(raw_data)

The most played games

# One row per "play" record: hours played on the game (time), the user's
# total hours over all games (total_time) and the share of that total spent
# on this game (perc_time).
play_time <- raw_data %>%
  filter(behavior.name == "play") %>%
  group_by(user.id) %>%
  mutate(
    total_time = sum(value),
    perc_time = value / total_time
  ) %>%
  ungroup() %>%
  select(user.id, game.title, time = value, perc_time, total_time)

# Total hours played per game, ranked from most to least played.
# The per-game total must add up `time` (hours spent on that game); summing
# `total_time` would add each player's hours over ALL of their games.
games_play_time <- play_time %>%
  group_by(game.title) %>%
  summarize(total_time = sum(time)) %>%
  ungroup() %>%
  arrange(desc(total_time)) %>%
  mutate(rnum = row_number())

# Keep the top (considered_n - 1) games and fold every remaining game,
# from rank considered_n onwards, into a single "Others" row.
considered_n <- 21
to_plot <- games_play_time %>% filter(rnum <= considered_n)
# Only build the "Others" row when there is something to fold into it;
# with fewer than considered_n games the table is plotted as is.
if (nrow(games_play_time) >= considered_n) {
  to_plot$game.title[considered_n] <- "Others"
  to_plot$total_time[considered_n] <- games_play_time %>%
    filter(rnum >= considered_n) %>%
    pull(total_time) %>%
    sum()
}

# TODO add a button to switch from one chart to the other
# TODO add labels on top of the columns
# TODO could a "broken" column be drawn to show that it would be much taller,
#      with the "out of scale" value placed on top of it?

# Bar chart of every row in to_plot.
ggplot(to_plot, aes(x = rnum, y = total_time)) +
  geom_col()

# Same chart without the row at rank considered_n.
ggplot(filter(to_plot, rnum != considered_n), aes(x = rnum, y = total_time)) +
  geom_col()

# Build the pie-chart table for the rows in user_play_time (not defined in
# this part of the file; presumably the play_time rows of one user).
# Each row carries the game's share of play time plus the running share.
user_slices <- user_play_time %>%
  select(game.title, perc_time, time) %>%
  mutate(cum_perc_time = cumsum(perc_time)) %>%
  arrange(cum_perc_time)

if (nrow(user_play_time) > 10) {
  # With more than 10 games, keep the games whose running share stays within
  # perc_to_show and fold the remainder into an "Others" row.
  perc_to_show <- 0.90
  to_plot <- user_slices %>%
    filter(cum_perc_time <= perc_to_show)

  # If the very first game already exceeds perc_to_show nothing is kept;
  # max() of an empty vector is -Inf, which would make the "Others" share
  # infinite, so the shown share is 0 in that case.
  shown_perc <- if (nrow(to_plot) > 0) max(to_plot$cum_perc_time) else 0

  to_plot <- to_plot %>%
    add_row(
      game.title = "Others",
      perc_time = 1.0 - shown_perc,
      # Hours not accounted for by the games kept above.
      time = sum(user_play_time$time) - sum(to_plot$time),
      cum_perc_time = 1.0
    )
} else {
  to_plot <- user_slices
}

# If the player played a great many games, spreading their time across them,
# the table is still too long: keep only the 7 games with the largest share
# and rebuild the "Others" row from what is left.
if (nrow(to_plot) > 10) {
  too_big_to_plot <- to_plot %>%
    filter(game.title != "Others") %>%
    arrange(desc(perc_time)) %>%
    select(-cum_perc_time) %>%
    mutate(nrow = row_number())

  # Rows are already ranked by share, so the first 7 are those with nrow < 8.
  to_plot <- too_big_to_plot %>%
    head(7) %>%
    select(game.title, perc_time, time) %>%
    mutate(cum_perc_time = cumsum(perc_time)) %>%
    arrange(cum_perc_time)

  hours_all <- pull(summarize(user_play_time, sum(time)))
  hours_shown <- pull(summarize(to_plot, sum(time)))
  to_plot <- to_plot %>%
    add_row(
      game.title = "Others",
      perc_time = 1.0 - max(to_plot$cum_perc_time),
      time = hours_all - hours_shown,
      cum_perc_time = 1.0
    )
}
# Add the slice label ("<percent>\n<hours>h") and the position of the label
# along the pie: the midpoint of each slice once rows are ordered by
# descending game title.
to_plot <- to_plot %>%
  mutate(
    label = paste(scales::percent(perc_time), paste0(time, "h"), sep = "\n")
  ) %>%
  select(-cum_perc_time) %>%
  arrange(desc(game.title)) %>%
  mutate(lab.ypos = cumsum(perc_time) - perc_time / 2)

to_plot
# TODO find a trick to order the legend

# Pie chart of the play-time shares, one slice per row of to_plot.
slice_count <- nrow(to_plot)
# "Pastel1" only defines 3 to 9 colours: brewer.pal() warns outside that range
# and never returns more than 9, which leaves scale_fill_manual() short of
# values when to_plot has 10 rows. Interpolate the extra colours in that case.
pastel_max <- 9
slice_colors <- if (slice_count <= pastel_max) {
  brewer.pal(n = max(slice_count, 3), name = "Pastel1")
} else {
  colorRampPalette(brewer.pal(n = pastel_max, name = "Pastel1"))(slice_count)
}
# pull(user.id) yields one id per row of user_play_time, so pasting it directly
# builds a vector of titles; collapse the distinct ids into a single string.
chart_title <- paste(
  "Play Time of User",
  paste(unique(pull(user_play_time, user.id)), collapse = ", ")
)

to_plot %>%
  ggplot(aes(x = "", y = perc_time, fill = game.title)) +
  geom_bar(width = 1, stat = "identity", color = "white") +
  coord_polar("y") +
  # The smaller the slice, the further out its label is placed.
  geom_text(
    aes(x = 0.3 + (1 - perc_time * .6), y = lab.ypos, label = label),
    color = "black"
  ) +
  scale_fill_manual(values = slice_colors) +
  labs(
    title = chart_title,
    fill = "Game Titles"
  ) +
  theme_void()

The most bought and NOT played

# For each game, count how many "purchase" records it has
t1 <- raw_data %>%
  filter(behavior.name == "purchase") %>%
  count(game.title, sort = TRUE, name = "buy_num")
# For each game, count how many "play" records it has
t2 <- raw_data %>%
  filter(behavior.name == "play") %>%
  count(game.title, sort = TRUE, name = "play_num")

# Share of purchases without a matching play record, highest first.
t1 %>%
  full_join(t2, by = "game.title") %>%
  replace_na(list(buy_num = 0, play_num = 0)) %>%
  mutate(buy_no_play = 1 - play_num / buy_num) %>%
  arrange(desc(buy_no_play))

Alcuni sono DLC! In realtà sono stati giocati, perché basta giocare al gioco base. Usiamo un altro CSV per capire quali sono veramente giochi: Steam games complete dataset.

# Second dataset: the Steam games catalogue, minus the columns listed below.
raw_data_2 <- read.csv("steam_games.csv") %>%
  as_tibble() %>%
  select(
    -c(
      url, recent_reviews, all_reviews, mature_content,
      minimum_requirements, recommended_requirements, discount_price
    )
  )
# TODO convert release_date into a real date if feasible; fine as is otherwise.
# Draft of that conversion, kept for reference:
  #replace_na(list(release_date = "NA")) %>%
  #mutate_at(vars(release_date), ~replace(., is.nan(.), "NA")) %>%
  #mutate( across(
  #  c(release_date),
  #  function(x) {
  #    if (is.character(x) && x != "NA" && x != "NaN" ) {
  #      parse_date(x, "%b %d, %Y",locale=locale("en"))
  #    } else {
  #      return(NA) 
  #    }} )
  #  )

head(raw_data_2)
colnames(raw_data_2)
##  [1] "types"            "name"             "desc_snippet"     "release_date"    
##  [5] "developer"        "publisher"        "popular_tags"     "game_details"    
##  [9] "languages"        "achievements"     "genre"            "game_description"
## [13] "original_price"
dim(raw_data_2)
## [1] 40833    13
# Number of distinct game names
raw_data_2 %>%
  distinct(name) %>%
  nrow()
## [1] 40752

Cerchiamo di capire come distinguere DLC dal resto…

# Rows of type "app" whose popular tags mention "DLC".
# Other searches tried here: "Elder Scrolls" in name, and "DLC" in
# desc_snippet, game_details, genre and game_description.
raw_data_2 %>%
  filter(
    types == "app",
    grepl("DLC", popular_tags)
  )

Sembra che ci siano solo giochi (alcuni contengono i DLC, GOTY Edition, etc.).

# TODO REMOVE
colnames(raw_data)
## [1] "user.id"       "game.title"    "behavior.name" "value"
colnames(raw_data_2)
##  [1] "types"            "name"             "desc_snippet"     "release_date"    
##  [5] "developer"        "publisher"        "popular_tags"     "game_details"    
##  [9] "languages"        "achievements"     "genre"            "game_description"
## [13] "original_price"

# One row per game title found in raw_data, enriched with the catalogue
# columns of raw_data_2 where the title matches a catalogue name exactly.
# Titles without a match keep NA in every catalogue column.
games_info_raw <- raw_data %>%
  distinct(game.title) %>%
  arrange(game.title) %>%
  left_join(raw_data_2, by = c("game.title" = "name")) %>%
  # A title can match several catalogue rows; keep the first one only.
  group_by(game.title) %>%
  slice(1) %>%
  # Drop the grouping so later steps work on a plain, ungrouped table.
  ungroup()
games_info_raw
# TODO REMOVE
#dim(raw_data %>% distinct(game.title))
#dim(games_info)

Ma di quanti abbiamo effettivamente i dati?

# Games for which the catalogue join found a row (types is not NA).
filter(games_info_raw, !is.na(types))

Meno di 2000. Questi giochi appartengono a quanti dei giocatori?

# Behaviour records restricted to the games that have catalogue information
# and whose release_date is neither "NaN" nor "NA".
data <- raw_data %>%
  right_join(
    games_info_raw %>%
      filter(
        !is.na(types),
        release_date != "NaN",
        release_date != "NA"
      ) %>%
      select(game.title),
    by = "game.title"
  )

# Size of the working set: all records, distinct users, distinct games,
# then the "play" and "purchase" records separately.
data
distinct(data, user.id)
distinct(data, game.title)
filter(data, behavior.name == "play")
filter(data, behavior.name == "purchase")

Quindi lavorerei con oltre 10k persone e circa 2k giochi, con oltre 90k interazioni user-game (di cui 35k play e 55k purchase). QUESTI SONO I VERI DATI DI PARTENZA.

# Catalogue information for exactly the games that appear in `data`.
games_info <- games_info_raw %>%
  right_join(data %>% distinct(game.title), by = "game.title")

games_info
# FALSE rather than F: F is an ordinary variable that can be reassigned.
write.csv(games_info, "games_info.csv", row.names = FALSE)

# Behaviour records of raw_data whose (user, game, behaviour) key appears in
# `data`. The keys are de-duplicated first: `data` repeats a key once per
# matching raw_data row, so joining on the repeated keys would multiply every
# duplicated record by its own count.
users_info <- raw_data %>%
  right_join(
    data %>% distinct(user.id, game.title, behavior.name),
    by = c("user.id", "game.title", "behavior.name")
  )
users_info
write.csv(users_info, "users_info.csv", row.names = FALSE)

TODO

Grafico “scatter” in cui: X è l'asse temporale; le palle sono i giochi; il diametro è il (log) numero di giocatori; il colore potrebbe essere il genere; Y è il numero di ore totali dei giocatori.

# Read games_info.csv and users_info.csv back from disk and print them.
games_info <- read.csv("games_info.csv")
games_info

users_info <- read.csv("users_info.csv")
users_info

games_info